library(plotly)
library(plyr)
# Load the Boston 2021 property assessment data set. ZIPCODE is forced to
# character instead of letting read.csv() infer its type.
pa <- read.csv(
  "https://data.boston.gov/dataset/e02c44d2-3c64-459c-8fe2-e1ce5f38a035/resource/c4b7331e-e213-45a5-adda-052e4dd31d41/download/data2021-full.csv",
  colClasses = c("ZIPCODE" = "character")
)

# Get only desired columns
cols <- c(
  "PID", "CITY", "LU_DESC", "OWN_OCC", "LIVING_AREA",
  "TOTAL_VALUE", "YR_BUILT", "EXT_COND", "BED_RMS", "FULL_BTH"
)
pa <- pa[, cols]

# Get only "SINGLE FAM DWELLING".
# %in% never returns NA, so a record with a missing LU_DESC is dropped rather
# than turned into an all-NA row (which is what `==` does under logical
# indexing).
pa <- pa[pa$LU_DESC %in% "SINGLE FAM DWELLING", ]

# Convert string "$ddd,ddd.dd" to numeric by stripping "$" and "," first
pa$TOTAL_VALUE <- as.numeric(gsub("[$,]", "", pa$TOTAL_VALUE))

# Map values for OWN_OCC: "Y" -> "Yes", "N" -> "No"
pa$OWN_OCC <- mapvalues(
  pa$OWN_OCC,
  from = c("Y", "N"),
  to = c("Yes", "No")
)

# Per-column summary of the cleaned data (printed output follows as comments)
summary(pa)
## PID CITY LU_DESC OWN_OCC
## Min. :1.000e+08 Length:30502 Length:30502 Length:30502
## 1st Qu.:1.501e+09 Class :character Class :character Class :character
## Median :1.807e+09 Mode :character Mode :character Mode :character
## Mean :1.592e+09
## 3rd Qu.:2.003e+09
## Max. :2.206e+09
##
## LIVING_AREA TOTAL_VALUE YR_BUILT EXT_COND
## Min. : 168 Min. : 128200 Min. :1710 Length:30502
## 1st Qu.: 1312 1st Qu.: 429300 1st Qu.:1900 Class :character
## Median : 1600 Median : 537200 Median :1925 Mode :character
## Mean : 1768 Mean : 696676 Mean :1928
## 3rd Qu.: 2021 3rd Qu.: 705800 3rd Qu.:1950
## Max. :21710 Max. :27977700 Max. :2020
## NA's :1 NA's :6
## BED_RMS FULL_BTH
## Min. : 1.000 Min. : 1.000
## 1st Qu.: 3.000 1st Qu.: 1.000
## Median : 3.000 Median : 1.000
## Mean : 3.404 Mean : 1.482
## 3rd Qu.: 4.000 3rd Qu.: 2.000
## Max. :12.000 Max. :12.000
## NA's :3 NA's :2
# Categorical variable: OWN_OCC (Dawn)
# Frequency table of owner-occupancy status (table() leaves out NA by default)
own_occ <- table(pa$OWN_OCC)

# Same table expressed as percentages of all counted dwellings
100 * prop.table(own_occ)
##
## No Yes
## 16.52679 83.47321

# Bar chart of the counts per status. "category descending" sorts the x-axis
# categories in descending name order.
layout(
  plot_ly(
    x = names(own_occ),
    y = as.numeric(own_occ),
    type = "bar"
  ),
  title = "Owner Occupied",
  xaxis = list(categoryorder = "category descending"),
  yaxis = list(title = "Single Family Dwellings")
)
# Numerical variable: FULL_BTH (Dawn)
# Frequency table of the number of full bathrooms per dwelling
# (table() leaves out NA by default).
full_bth <- table(pa$FULL_BTH)

# table() stores the distinct values as character names; convert them back to
# numbers so they can be used as a numeric x axis.
full_bth_names <- as.numeric(names(full_bth))

# Bar chart of dwellings per bathroom count
plot_ly(
  x = full_bth_names,
  y = as.numeric(full_bth),
  type = "bar"
) %>%
  layout(
    title = "Full Bathrooms",
    xaxis = list(
      title = "Bathrooms",
      # One tick per integer from 1 up to the largest observed count, so
      # counts with no dwellings still get a tick. seq_len() replaces the
      # redundant seq(1:n) form.
      tickvals = seq_len(max(full_bth_names))
    ),
    yaxis = list(title = "Single Family Dwellings")
  )